Here we will simulate two symptoms which give rise to a disease. The symptoms are not heritable. However, there are genetic effects on the D matrix in the LDL decomposition of \(\Sigma\). We control the heritability of D. We also make a couple more simplifying design choices: \(L\) is lower triangular and contains ones only so the covariance of the symptoms depends only on \(D_{1,1}\).

N <- 1000
l <- 100
reps <- 5
p <- 2
hercovs <- seq(0,1,0.2)
her <- 0

L <- matrix(0,nrow=p,ncol=p)
L[lower.tri(L,diag=T)] <- 1
diag(L) <- 1

rows <- data.frame()

for (r in c(1:reps)){
  print(r)
  for (hercov in hercovs){
    print(hercov)
    
    G <- simulate_genotypes(N = N,L = l)
    G <- scale(G)

    beta_cov <- matrix(mvrnorm(n=l,mu = rep(0,p),Sigma = diag(p)),nrow=l) %*% diag(3,p,p) * sqrt(hercov / l)
    e_cov <- matrix(mvrnorm(n=N,mu = rep(0,p),Sigma = diag(p)),nrow=N) %*% diag(3,p,p) * sqrt(1-hercov)
    D_vectors <- G %*% beta_cov + e_cov + matrix(10,nrow=N,ncol=p)
    D_vectors[D_vectors<0] <- 0.1
    
    beta <- matrix(mvrnorm(n=l,mu = rep(0,p),Sigma = diag(5,p,p)),nrow=l) %*% diag(1,p,p) * sqrt(her / l)
    e <- matrix(mvrnorm(n=N,mu = rep(0,p),Sigma = diag(5,p,p)),nrow=N) * sqrt(1-her)
    
    X <- matrix(0,nrow=N,ncol=p)
    # Give each person a covariance matrix and draw symptoms
    for (ind in c(1:N)){
        # make into matrix
        D_vectors[ind,2] <- 1 # setting this to one to control the correlation
        Sig_ind <- L %*% diag(D_vectors[ind,]) %*% t(L)
        beta_transformed <- t(chol(Sig_ind)) %*% t(beta) %>% t() # check this
        e_transformed <- t(chol(Sig_ind)) %*% e[ind,] %>% t() # check this
        X[ind,] <- G[ind,] %*% beta_transformed + e_transformed
    }
    
    # setting the threshold to keep constant prevalence
    c <- 1
    n <- 1
    Y <- apply(X,1,mdd_risk,threshold=c,n=n)
    prev <- sum(Y)/N
    # calculating heritabilities
    h2D <- greml(D_vectors[,1],G)$h2
    h2IP <- greml(X[,1] * X[,2],G)$h2
    h2Y <- greml(Y,G)$h2
    
    rows <- rbind(rows,data.frame("h2_symptoms"=her,
                              "h2_Y"=h2Y,
                              "h2_IP"=h2IP,
                              "h2_D"=h2D,
                              "prev"=sum(Y),
                              "rep"=r,
                              "P"=p,
                              "c"=c,
                              "n"=n,
                              "hercov"=hercov))
    
  }
}
## [1] 1
## [1] 0
## [1] 0.2
## [1] 0.4
## [1] 0.6
## [1] 0.8
## [1] 1
## [1] 2
## [1] 0
## [1] 0.2
## [1] 0.4
## [1] 0.6
## [1] 0.8
## [1] 1
## [1] 3
## [1] 0
## [1] 0.2
## [1] 0.4
## [1] 0.6
## [1] 0.8
## [1] 1
## [1] 4
## [1] 0
## [1] 0.2
## [1] 0.4
## [1] 0.6
## [1] 0.8
## [1] 1
## [1] 5
## [1] 0
## [1] 0.2
## [1] 0.4
## [1] 0.6
## [1] 0.8
## [1] 1
rows %>% 
  pivot_longer(cols=c("h2_D","h2_Y","h2_IP"),names_to = "feature",values_to = "GREML heritability") %>%
  ggplot(aes(x=hercov,col=feature,y=`GREML heritability`))+
  geom_point() +
  geom_smooth(method="lm",se=F,linetype=2,linewidth=0.5,alpha=0.5)
## `geom_smooth()` using formula = 'y ~ x'

labs(x="covariance heritability (heritability of D)")
## $x
## [1] "covariance heritability (heritability of D)"
## 
## attr(,"class")
## [1] "labels"

We can see that the entries of D are heritable, but the disease and individual-level product are not very heritable. Why?

Let’s do a rerun of the last simulation results where the covariance heritability was 1, but add more individuals and loci.

N <- 2000
l <- 100
hercov <- 1

G <- simulate_genotypes(N = N,L = l)
G <- scale(G)

beta_cov <- matrix(mvrnorm(n=l,mu = rep(0,p),Sigma = diag(p)),nrow=l) %*% diag(3,p,p) * sqrt(hercov / l)
e_cov <- matrix(mvrnorm(n=N,mu = rep(0,p),Sigma = diag(p)),nrow=N) %*% diag(3,p,p) * sqrt(1-hercov)
D_vectors <- G %*% beta_cov + e_cov + matrix(10,nrow=N,ncol=p)
D_vectors[D_vectors<0] <- 0.1

beta <- matrix(mvrnorm(n=l,mu = rep(0,p),Sigma = diag(5,p,p)),nrow=l) %*% diag(1,p,p) * sqrt(her / l)
e <- matrix(mvrnorm(n=N,mu = rep(0,p),Sigma = diag(5,p,p)),nrow=N) * sqrt(1-her)

X <- matrix(0,nrow=N,ncol=p)
# Give each person a covariance matrix and draw symptoms
for (ind in c(1:N)){
    # make into matrix
    D_vectors[ind,2] <- 1 # setting this to one to control the correlation
    Sig_ind <- L %*% diag(D_vectors[ind,]) %*% t(L)
    beta_transformed <- t(chol(Sig_ind)) %*% t(beta) %>% t() # check this
    e_transformed <- t(chol(Sig_ind)) %*% e[ind,] %>% t() # check this
    X[ind,] <- G[ind,] %*% beta_transformed + e_transformed
}

# setting the threshold to keep constant prevalence
c <- 5
n <- 1
Y <- apply(X,1,mdd_risk,threshold=c,n=n)

We’re going to calculate and normalise the individual-level product as they do in CLIP, as \(IP=\frac{(x_1-\bar{x_1})(x_2-\bar{x_2})}{\sqrt{{x_1}{x_2}}}\).

phenos <- data.frame(X,Y,D=D_vectors[,1]) %>%
  mutate(IP12=((X1-mean(X1))* (X2-mean(X2)))/sqrt(var(X1)*var(X2))) # normalised IP

Let’s visualise how the symptoms, and the IP, relates to the disease. s

ggplot(phenos) + geom_point(aes(x=X1,y=X2,col=IP12),shape=16) + scale_color_viridis_c()

ggplot(phenos) + geom_point(aes(x=X1,y=X2,col=Y),shape=16) 

ggplot(phenos) + geom_point(aes(x=D,y=IP12,col=Y)) 

And now we’ll run GWAS on: * the entries of D: this should be easy, and we should get the covariance QTLs * the IP: might be trickier, but there should be some signal * the disease: we hope to have some signal from the covariance QTLs… * the symptoms: these are not directly heritable, so hopefully just noise

apply(G,2,function(x) lm(phenos$IP12 ~ x)$coefficients[2]) -> GWAS_IP
apply(G,2,function(x) lm(phenos$D ~ x)$coefficients[2]) -> GWAS_D
apply(G,2,function(x) lm(phenos$X1 ~ x)$coefficients[2]) -> GWAS_X1
apply(G,2,function(x) lm(phenos$X2 ~ x)$coefficients[2]) -> GWAS_X2
apply(G,2,function(x) lm(phenos$Y ~ x)$coefficients[2]) -> GWAS_Y
data.frame(beta=beta_cov[,1],
           beta_hat_D=GWAS_D,
           beta_hat_IP=GWAS_IP,
           beta_hat_Y=GWAS_Y,
           beta_hat_X1=GWAS_X1,
           beta_hat_X2=GWAS_X2
           ) -> GWAS

How do the estimated effect sizes compare to the real ones and to each other?

GWAS %>% ggplot(aes(x=beta,y=beta_hat_D)) + geom_point() +geom_smooth(method="lm")
## `geom_smooth()` using formula = 'y ~ x'

GWAS %>% ggplot(aes(x=beta,y=beta_hat_IP)) + geom_point() +geom_smooth(method="lm")
## `geom_smooth()` using formula = 'y ~ x'

GWAS %>% ggplot(aes(x=beta,y=beta_hat_X1)) + geom_point() +geom_smooth(method="lm")
## `geom_smooth()` using formula = 'y ~ x'

GWAS %>% ggplot(aes(x=beta,y=beta_hat_X2)) + geom_point() +geom_smooth(method="lm") 
## `geom_smooth()` using formula = 'y ~ x'

GWAS %>% ggplot(aes(x=beta,y=beta_hat_Y)) + geom_point() +geom_smooth(method="lm") 
## `geom_smooth()` using formula = 'y ~ x'

So GWAS is good at finding D entries, but not so good at finding individual-level products.

GWAS %>% cor() %>% ggcorrplot(method="circle")

This correlation plot shows how the effect size estimates are correlated across the different phenotypes available. If we look at the beta row, we see that there is signal in the estimated effect sizes on D and to some extent on the IP and Y, but not on the symptoms. This is what we want!

If we look at the beta_hat_Y row, we see that the signal is shared across the symptoms (makes sense, since the disease is a function of the symptoms…?) and a little across the covariance QTLs.

What if the effects are on L?

Now I will model genetic effects on the bottom left entry of L, which controls the covariance: \(cov(X_1,X_2)=L_{2,1}D_1\) and the variance of \(X_2\): \(Var(X_2)=L_1^2D_1 + D_2\). The entries of D will be constant, and I will keep them at 1.

D <- diag(1,2,2)
L <- matrix(0,nrow=p,ncol=p)
L[lower.tri(L,diag=T)] <- 1
diag(L) <- 1

beta_cov <- matrix(mvrnorm(n=l,mu = rep(0,1),Sigma = diag(1)),nrow=l) %*% diag(2,1,1) * sqrt(hercov / l)
e_cov <- matrix(mvrnorm(n=N,mu = rep(0,1),Sigma = diag(1)),nrow=N) %*% diag(2,1,1) * sqrt(1-hercov)
L_vectors <- G %*% beta_cov + e_cov

beta <- matrix(mvrnorm(n=l,mu = rep(0,p),Sigma = diag(5,p,p)),nrow=l) %*% diag(1,p,p) * sqrt(her / l)
e <- matrix(mvrnorm(n=N,mu = rep(0,p),Sigma = diag(5,p,p)),nrow=N) * sqrt(1-her)

XL <- matrix(0,nrow=N,ncol=p)
# Give each person a covariance matrix and draw symptoms
for (ind in c(1:N)){
    # make into matrix
    L_ind <- L
    L_ind[2,1] <- L_vectors[ind]
    Sig_ind <- L_ind %*% D %*% t(L_ind)
    beta_transformed <- t(chol(Sig_ind)) %*% t(beta) %>% t() # check this
    e_transformed <- t(chol(Sig_ind)) %*% e[ind,] %>% t() # check this
    XL[ind,] <- G[ind,] %*% beta_transformed + e_transformed
}

# setting the threshold to keep constant prevalence
c <- 1
n <- 1
YL <- apply(XL,1,mdd_risk,threshold=c,n=n)
prev <- sum(YL)/N
# calculating heritabilities
h2L <- greml(L_vectors,G)$h2
h2IP <- greml(XL[,1] * XL[,2],G)$h2
h2Y <- greml(YL,G)$h2
print(h2L)
## [1] 1.404324
print(h2IP)
## [1] 1.275842
print(h2Y)
## [1] 1.829318

Note how the heritability of the individual-level product, and of the disease are much higher now.

phenosL <- data.frame(XL,Y=YL,L=L_vectors) %>%
  mutate(IP12=((X1-mean(X1))* (X2-mean(X2)))/sqrt(var(X1)*var(X2))) # normalised IP

Plotting various things:

ggplot(phenosL) + geom_point(aes(x=X1,y=X2,col=IP12),shape=16) + scale_color_viridis_c()

ggplot(phenosL) + geom_point(aes(x=X1,y=X2,col=L),shape=16) + scale_color_viridis_c()

ggplot(phenosL) + geom_point(aes(x=X1,y=X2,col=Y),shape=16) 

ggplot(phenosL) + geom_point(aes(x=L,y=IP12,col=Y)) 

What an odd pattern of symptoms.

apply(G,2,function(x) lm(phenosL$IP12 ~ x)$coefficients[2]) -> GWAS_IP
apply(G,2,function(x) lm(phenosL$L ~ x)$coefficients[2]) -> GWAS_L
apply(G,2,function(x) lm(phenosL$X1 ~ x)$coefficients[2]) -> GWAS_X1
apply(G,2,function(x) lm(phenosL$X2 ~ x)$coefficients[2]) -> GWAS_X2
apply(G,2,function(x) lm(phenosL$Y ~ x)$coefficients[2]) -> GWAS_Y
data.frame(beta=beta_cov[,1],
           beta_hat_L=GWAS_L,
           beta_hat_IP=GWAS_IP,
           beta_hat_Y=GWAS_Y,
           beta_hat_X1=GWAS_X1,
           beta_hat_X2=GWAS_X2
           ) -> GWASL

How does GWAS do?

GWASL %>% ggplot(aes(x=beta,y=beta_hat_L)) + geom_point() +geom_smooth(method="lm")
## `geom_smooth()` using formula = 'y ~ x'

GWASL %>% ggplot(aes(x=beta,y=beta_hat_IP)) + geom_point() +geom_smooth(method="lm")
## `geom_smooth()` using formula = 'y ~ x'

GWASL %>% ggplot(aes(x=beta,y=beta_hat_X1)) + geom_point() +geom_smooth(method="lm")
## `geom_smooth()` using formula = 'y ~ x'

GWASL %>% ggplot(aes(x=beta,y=beta_hat_X2)) + geom_point() +geom_smooth(method="lm") 
## `geom_smooth()` using formula = 'y ~ x'

GWASL %>% ggplot(aes(x=beta,y=beta_hat_Y)) + geom_point() +geom_smooth(method="lm") 
## `geom_smooth()` using formula = 'y ~ x'

GWASL %>% cor() %>% ggcorrplot(method="circle")

Better than when the entries are on D!